{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "27381a67",
   "metadata": {},
   "source": [
    "Appendix B, table B3, B4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fb3c3356",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========================================\n",
      "Processing country: ETH (Ethiopia)\n",
      "========================================\n",
      "Successfully read file. Shape: (11074, 1112)\n",
      "First 10 columns: ['id', 'hid', 't', 'singleton', 'longp', 'short', 'lbal', 'sbal', 'hhid2', 'clust']\n",
      "Column 'id' exists. First 5 values: [nan, 1000.0, 1000.0, 1000.0, 1001.0]\n",
      "Unique values in id: [nan, 1000.0, 1001.0, 1002.0, 1003.0, 1004.0, 1005.0, 1006.0, 1007.0, 1008.0]\n",
      "Column 't' exists. First 5 values: [nan, 1.0, 2.0, 3.0, 1.0]\n",
      "Unique values in t: [nan, 1.0, 2.0, 3.0, 4.0]\n",
      "Column 'participate2' exists. First 5 values: [nan, 0.0, 1.0, 1.0, 1.0]\n",
      "Unique values in participate2: [nan, 0.0, 1.0]\n",
      "Column 'SSP' exists. First 5 values: [nan, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in SSP: [nan, 0.0, 1.0]\n",
      "\n",
      "Initial record count: 11074\n",
      "After SSP filter: 10264 records (removed 810)\n",
      "After rounds filter: 7603 records (removed 2661)\n",
      "Rounds in data after filter: [1.0, 2.0, 3.0]\n",
      "Number of rounds per household:\n",
      "  3 rounds: 1826 households\n",
      "  2 rounds: 838 households\n",
      "  1 rounds: 449 households\n",
      "Households with all 3 rounds: 1826 (out of 3113 total households)\n",
      "After complete panel filter: 5478 records\n",
      "\n",
      "Household summary statistics created. Shape: (1826, 2)\n",
      "Classification results:\n",
      "  Always participated: 1040 households\n",
      "  Never participated: 73 households\n",
      "  Mixed behavior: 713 households\n",
      "\n",
      "Starting detailed classification of 713 mixed households\n",
      "Examining 5 sample households:\n",
      "  Household 1000.0: rounds=[1.0, 2.0, 3.0], participate2=[0.0, 1.0, 1.0]\n",
      "  Household 1008.0: rounds=[1.0, 2.0, 3.0], participate2=[1.0, 1.0, 1.0]\n",
      "  Household 1012.0: rounds=[1.0, 2.0, 3.0], participate2=[0.0, 1.0, 0.0]\n",
      "  Household 1013.0: rounds=[1.0, 2.0, 3.0], participate2=[0.0, 1.0, 1.0]\n",
      "  Household 1014.0: rounds=[1.0, 2.0, 3.0], participate2=[0.0, 1.0, 0.0]\n",
      "Found 713 households with mixed behavior for detailed classification\n",
      "Original classification:\n",
      "  Became commercial: 278 (39.0%)\n",
      "  Stopped commercial: 42 (5.9%)\n",
      "  Fluctuating: 393 (55.1%)\n",
      "Relaxed classification:\n",
      "  Became commercial: 352 (49.4%)\n",
      "  Stopped commercial: 194 (27.2%)\n",
      "  Fluctuating: 167 (23.4%)\n",
      "\n",
      "========================================\n",
      "Processing country: NGR (Nigeria)\n",
      "========================================\n",
      "Successfully read file. Shape: (12844, 1268)\n",
      "First 10 columns: ['hhid2', 'id', 't', 's', 'j', 'singleton', 'short', 'lbal', 'sbal', 'clust']\n",
      "Column 'id' exists. First 5 values: [10057.0, 10065.0, 10057.0, 19023.0, 19284.0]\n",
      "Unique values in id: [10001.0, 10002.0, 10003.0, 10004.0, 10005.0, 10008.0, 10009.0, 10010.0, 10011.0, 10012.0]\n",
      "Column 't' exists. First 5 values: [2.0, 3.0, 3.0, 4.0, 4.0]\n",
      "Unique values in t: [1.0, 2.0, 3.0, 4.0]\n",
      "Column 'participate2' exists. First 5 values: [1.0, 0.0, 1.0, 0.0, 0.0]\n",
      "Unique values in participate2: [0.0, 1.0]\n",
      "Column 'SSP' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in SSP: [0.0, 1.0]\n",
      "\n",
      "Initial record count: 12844\n",
      "After SSP filter: 11274 records (removed 1570)\n",
      "After rounds filter: 11274 records (removed 0)\n",
      "Rounds in data after filter: [1.0, 2.0, 3.0, 4.0]\n",
      "Number of rounds per household:\n",
      "  1 rounds: 3022 households\n",
      "  3 rounds: 1588 households\n",
      "  2 rounds: 818 households\n",
      "  4 rounds: 463 households\n",
      "Households with all 4 rounds: 463 (out of 5891 total households)\n",
      "After complete panel filter: 1852 records\n",
      "\n",
      "Household summary statistics created. Shape: (463, 2)\n",
      "Classification results:\n",
      "  Always participated: 202 households\n",
      "  Never participated: 4 households\n",
      "  Mixed behavior: 257 households\n",
      "\n",
      "Starting detailed classification of 257 mixed households\n",
      "Examining 5 sample households:\n",
      "  Household 10061.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 1.0, 1.0, 1.0]\n",
      "  Household 10062.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 1.0, 1.0, 1.0]\n",
      "  Household 10063.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 1.0, 1.0, 1.0]\n",
      "  Household 10064.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[0.0, 1.0, 1.0, 1.0]\n",
      "  Household 10065.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[0.0, 1.0, 0.0, 1.0]\n",
      "Found 257 households with mixed behavior for detailed classification\n",
      "Original classification:\n",
      "  Became commercial: 76 (29.6%)\n",
      "  Stopped commercial: 17 (6.6%)\n",
      "  Fluctuating: 164 (63.8%)\n",
      "Relaxed classification:\n",
      "  Became commercial: 84 (32.7%)\n",
      "  Stopped commercial: 66 (25.7%)\n",
      "  Fluctuating: 107 (41.6%)\n",
      "\n",
      "========================================\n",
      "Processing country: MLW (Malawi)\n",
      "========================================\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\587987423.py:102: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  df = pd.read_stata(path, convert_categoricals=False)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully read file. Shape: (7187, 1055)\n",
      "First 10 columns: ['id', 't', 'hid', 'singleton', 'short', 'lbal', 'sbal', 'i', 'fid', 'clust']\n",
      "Column 'id' exists. First 5 values: [1755.0, 2698.0, 3125.0, 1712.0, 288.0]\n",
      "Unique values in id: [1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0]\n",
      "Column 't' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in t: [1.0, 2.0, 3.0, 4.0]\n",
      "Column 'participate2' exists. First 5 values: [0.0, 0.0, 0.0, 0.0, 0.0]\n",
      "Unique values in participate2: [0.0, 1.0]\n",
      "Column 'SSP' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in SSP: [0.0, 1.0]\n",
      "\n",
      "Initial record count: 7187\n",
      "After SSP filter: 6403 records (removed 784)\n",
      "After rounds filter: 6403 records (removed 0)\n",
      "Rounds in data after filter: [1.0, 2.0, 3.0, 4.0]\n",
      "Number of rounds per household:\n",
      "  1 rounds: 926 households\n",
      "  4 rounds: 708 households\n",
      "  2 rounds: 595 households\n",
      "  3 rounds: 485 households\n",
      "Households with all 4 rounds: 708 (out of 2714 total households)\n",
      "After complete panel filter: 2832 records\n",
      "\n",
      "Household summary statistics created. Shape: (708, 2)\n",
      "Classification results:\n",
      "  Always participated: 131 households\n",
      "  Never participated: 82 households\n",
      "  Mixed behavior: 495 households\n",
      "\n",
      "Starting detailed classification of 495 mixed households\n",
      "Examining 5 sample households:\n",
      "  Household 1.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[0.0, 1.0, 0.0, 0.0]\n",
      "  Household 3.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 1.0, 1.0, 1.0]\n",
      "  Household 5.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 1.0, 0.0, 1.0]\n",
      "  Household 14.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 0.0, 1.0, 0.0]\n",
      "  Household 15.0: rounds=[1.0, 2.0, 3.0, 4.0], participate2=[1.0, 1.0, 1.0, 1.0]\n",
      "Found 495 households with mixed behavior for detailed classification\n",
      "Original classification:\n",
      "  Became commercial: 50 (10.1%)\n",
      "  Stopped commercial: 86 (17.4%)\n",
      "  Fluctuating: 359 (72.5%)\n",
      "Relaxed classification:\n",
      "  Became commercial: 88 (17.8%)\n",
      "  Stopped commercial: 110 (22.2%)\n",
      "  Fluctuating: 297 (60.0%)\n",
      "\n",
      "========================================\n",
      "Processing country: TZN (Tanzania)\n",
      "========================================\n",
      "Successfully read file. Shape: (10830, 1365)\n",
      "First 10 columns: ['id', 'hid', 't', 'singleton', 'longp', 'short', 'lbal', 'sbal', 'i', 'clust']\n",
      "Column 'id' exists. First 5 values: [3.0, 2.0, 1.0, 9.0, 5.0]\n",
      "Unique values in id: [1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]\n",
      "Column 't' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in t: [1.0, 2.0, 3.0, 4.0, 5.0]\n",
      "Column 'participate2' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in participate2: [0.0, 1.0]\n",
      "Column 'SSP' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in SSP: [0.0, 1.0]\n",
      "\n",
      "Initial record count: 10830\n",
      "After SSP filter: 9545 records (removed 1285)\n",
      "After rounds filter: 5828 records (removed 3717)\n",
      "Rounds in data after filter: [1.0, 2.0, 3.0]\n",
      "Number of rounds per household:\n",
      "  3 rounds: 1205 households\n",
      "  1 rounds: 907 households\n",
      "  2 rounds: 653 households\n",
      "Households with all 3 rounds: 1205 (out of 2765 total households)\n",
      "After complete panel filter: 3615 records\n",
      "\n",
      "Household summary statistics created. Shape: (1205, 2)\n",
      "Classification results:\n",
      "  Always participated: 615 households\n",
      "  Never participated: 61 households\n",
      "  Mixed behavior: 529 households\n",
      "\n",
      "Starting detailed classification of 529 mixed households\n",
      "Examining 5 sample households:\n",
      "  Household 1.0: rounds=[1.0, 2.0, 3.0], participate2=[1.0, 1.0, 0.0]\n",
      "  Household 2.0: rounds=[1.0, 2.0, 3.0], participate2=[1.0, 1.0, 1.0]\n",
      "  Household 3.0: rounds=[1.0, 2.0, 3.0], participate2=[1.0, 0.0, 0.0]\n",
      "  Household 5.0: rounds=[1.0, 2.0, 3.0], participate2=[1.0, 1.0, 1.0]\n",
      "  Household 6.0: rounds=[1.0, 2.0, 3.0], participate2=[1.0, 1.0, 0.0]\n",
      "Found 529 households with mixed behavior for detailed classification\n",
      "Original classification:\n",
      "  Became commercial: 270 (51.0%)\n",
      "  Stopped commercial: 33 (6.2%)\n",
      "  Fluctuating: 226 (42.7%)\n",
      "Relaxed classification:\n",
      "  Became commercial: 335 (63.3%)\n",
      "  Stopped commercial: 100 (18.9%)\n",
      "  Fluctuating: 94 (17.8%)\n",
      "\n",
      "========================================\n",
      "Processing country: UGD (Uganda)\n",
      "========================================\n",
      "Successfully read file. Shape: (11384, 1181)\n",
      "First 10 columns: ['id', 'HHID', 't', 'singleton', 'longp', 'medip', 'short', 'lbal', 'mbal', 'sbal']\n",
      "Column 'id' exists. First 5 values: [1013000204.0, 1013000204.0, 1013000210.0, 1013000210.0, 1021002610.0]\n",
      "Unique values in id: [206.0, 238.0, 326.0, 379.0, 506.0, 993.0, 1329.0, 1474.0, 1574.0, 1671.0]\n",
      "Column 't' exists. First 5 values: [1.0, 3.0, 3.0, 5.0, 1.0]\n",
      "Unique values in t: [1.0, 2.0, 3.0, 4.0, 5.0]\n",
      "Column 'participate2' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in participate2: [0.0, 1.0]\n",
      "Column 'SSP' exists. First 5 values: [1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Unique values in SSP: [0.0, 1.0]\n",
      "\n",
      "Initial record count: 11384\n",
      "After SSP filter: 10639 records (removed 745)\n",
      "After rounds filter: 10639 records (removed 0)\n",
      "Rounds in data after filter: [1.0, 2.0, 3.0, 4.0, 5.0]\n",
      "Number of rounds per household:\n",
      "  2 rounds: 1140 households\n",
      "  3 rounds: 934 households\n",
      "  5 rounds: 693 households\n",
      "  1 rounds: 572 households\n",
      "  4 rounds: 380 households\n",
      "Households with all 5 rounds: 693 (out of 3719 total households)\n",
      "After complete panel filter: 3465 records\n",
      "\n",
      "Household summary statistics created. Shape: (693, 2)\n",
      "Classification results:\n",
      "  Always participated: 322 households\n",
      "  Never participated: 7 households\n",
      "  Mixed behavior: 364 households\n",
      "\n",
      "Starting detailed classification of 364 mixed households\n",
      "Examining 5 sample households:\n",
      "  Household 1033000301.0: rounds=[1.0, 2.0, 3.0, 4.0, 5.0], participate2=[1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "  Household 1033000308.0: rounds=[1.0, 2.0, 3.0, 4.0, 5.0], participate2=[1.0, 1.0, 1.0, 1.0, 0.0]\n",
      "  Household 1033000504.0: rounds=[1.0, 2.0, 3.0, 4.0, 5.0], participate2=[1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "  Household 1033000505.0: rounds=[1.0, 2.0, 3.0, 4.0, 5.0], participate2=[1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "  Household 1033000507.0: rounds=[1.0, 2.0, 3.0, 4.0, 5.0], participate2=[1.0, 1.0, 1.0, 1.0, 1.0]\n",
      "Found 364 households with mixed behavior for detailed classification\n",
      "Original classification:\n",
      "  Became commercial: 47 (12.9%)\n",
      "  Stopped commercial: 31 (8.5%)\n",
      "  Fluctuating: 286 (78.6%)\n",
      "Relaxed classification:\n",
      "  Became commercial: 51 (14.0%)\n",
      "  Stopped commercial: 73 (20.1%)\n",
      "  Fluctuating: 240 (65.9%)\n",
      "\n",
      "SUMMARY OF PROCESSING:\n",
      "ETH: SUCCESS - 1826 households processed\n",
      "NGR: SUCCESS - 463 households processed\n",
      "MLW: SUCCESS - 708 households processed\n",
      "TZN: SUCCESS - 1205 households processed\n",
      "UGD: SUCCESS - 693 households processed\n",
      "\n",
      "Preparing Excel output...\n",
      "Unified panel analysis results saved at C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\Excel-Mauriio\\unified_panel_analysis_debug.xlsx\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "# --- Classification Functions ---\n",
    "\n",
    "def classify_mixed(seq):\n",
    "    \"\"\"\n",
    "    Original classification.\n",
    "    - Became Commercial: If the household starts with 0, then at the first occurrence of 1, all subsequent rounds are 1,\n",
    "      and there are at least 2 rounds of commercial activity.\n",
    "    - Stopped Commercial: If the household starts with 1 and the final rounds (at least 2) are 0.\n",
    "    - Fluctuating: Otherwise.\n",
    "    \"\"\"\n",
    "    if seq[0] == 0:\n",
    "        try:\n",
    "            first_commercial = seq.index(1)\n",
    "        except ValueError:\n",
    "            first_commercial = len(seq)\n",
    "        if all(x == 1 for x in seq[first_commercial:]) and (len(seq) - first_commercial >= 2):\n",
    "            return \"became_commercial\"\n",
    "    if seq[0] == 1:\n",
    "        reversed_seq = seq[::-1]\n",
    "        count_zeros = 0\n",
    "        for val in reversed_seq:\n",
    "            if val == 0:\n",
    "                count_zeros += 1\n",
    "            else:\n",
    "                break\n",
    "        if count_zeros >= 2:\n",
    "            return \"stopped_commercial\"\n",
    "    return \"fluctuating\"\n",
    "\n",
    "def classify_mixed_relaxed(seq):\n",
    "    \"\"\"\n",
    "    Relaxed classification.\n",
    "    - Became Commercial Relaxed: If the household starts with 0 and then from the first occurrence of 1 all subsequent rounds are 1,\n",
    "      regardless of the number of rounds.\n",
    "    - Stopped Commercial Relaxed: If the household starts with 1 and then from the first occurrence of 0 all subsequent rounds are 0.\n",
    "    - Fluctuating Relaxed: Otherwise.\n",
    "    \"\"\"\n",
    "    if seq[0] == 0:\n",
    "        try:\n",
    "            first_commercial = seq.index(1)\n",
    "        except ValueError:\n",
    "            first_commercial = len(seq)\n",
    "        if all(x == 1 for x in seq[first_commercial:]):\n",
    "            return \"became_commercial_relaxed\"\n",
    "    if seq[0] == 1:\n",
    "        try:\n",
    "            first_noncommercial = seq.index(0)\n",
    "        except ValueError:\n",
    "            first_noncommercial = len(seq)\n",
    "        if all(x == 0 for x in seq[first_noncommercial:]):\n",
    "            return \"stopped_commercial_relaxed\"\n",
    "    return \"fluctuating_relaxed\"\n",
    "\n",
    "# --- Panel Configuration ---\n",
    "# Note: Ghana is excluded from panel analysis.\n",
    "panel_config = {\n",
    "    \"ETH\": {\"id\": \"id\", \"rounds\": [1, 2, 3]},\n",
    "    \"NGR\": {\"id\": \"id\", \"rounds\": [1, 2, 3, 4]},\n",
    "    \"MLW\": {\"id\": \"id\", \"rounds\": [1, 2, 3, 4]},\n",
    "    \"TZN\": {\"id\": \"id\", \"rounds\": [1, 2, 3]},\n",
    "    \"UGD\": {\"id\": \"id\", \"rounds\": [1, 2, 3, 4, 5]}\n",
    "}\n",
    "\n",
    "# --- Data Paths and Country Names ---\n",
    "data_paths = {\n",
    "    \"MLW\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\MalPanel-5.dta\",\n",
    "    \"ETH\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\EthPanel-5.dta\",\n",
    "    \"UGD\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\UgaPanel-6.dta\",\n",
    "    \"TZN\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\TanPanel-5.dta\",\n",
    "    \"NGR\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\NigPanel-6.dta\",\n",
    "}\n",
    "country_names = {\n",
    "    \"MLW\": \"Malawi\",\n",
    "    \"ETH\": \"Ethiopia\",\n",
    "    \"UGD\": \"Uganda\",\n",
    "    \"TZN\": \"Tanzania\",\n",
    "    \"NGR\": \"Nigeria\",\n",
    "    \"GHN\": \"Ghana\"\n",
    "}\n",
    "\n",
    "# --- Dictionaries to Hold Results ---\n",
    "panel_results = {}\n",
    "original_results = {}\n",
    "relaxed_results = {}\n",
    "debug_info = {}\n",
    "\n",
    "for code, config in panel_config.items():\n",
    "    print(f\"\\n{'='*40}\")\n",
    "    print(f\"Processing country: {code} ({country_names[code]})\")\n",
    "    print(f\"{'='*40}\")\n",
    "    \n",
    "    panel_id = config[\"id\"]\n",
    "    rounds = config[\"rounds\"]\n",
    "    path = data_paths[code]\n",
    "    \n",
    "    try:\n",
    "        # Try with explicit encoding parameters\n",
    "        df = pd.read_stata(path, convert_categoricals=False)\n",
    "        print(f\"Successfully read file. Shape: {df.shape}\")\n",
    "        \n",
    "        # Debug: Check if critical columns exist\n",
    "        columns = list(df.columns)\n",
    "        print(f\"First 10 columns: {columns[:10]}\")\n",
    "        \n",
    "        critical_cols = [panel_id, \"t\", \"participate2\", \"SSP\"]\n",
    "        for col in critical_cols:\n",
    "            if col in df.columns:\n",
    "                print(f\"Column '{col}' exists. First 5 values: {df[col].head(5).tolist()}\")\n",
    "                print(f\"Unique values in {col}: {sorted(df[col].unique().tolist())[:10]}\")\n",
    "            else:\n",
    "                print(f\"WARNING: Column '{col}' DOES NOT EXIST in the dataset!\")\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"Error reading {path}: {e}\")\n",
    "        debug_info[code] = {\"error\": f\"Failed to read file: {str(e)}\"}\n",
    "        continue\n",
    "\n",
    "    # Debug: Initial count\n",
    "    print(f\"\\nInitial record count: {len(df)}\")\n",
    "    \n",
    "    # **** Filter: Keep only observations where SSP == 1 ****\n",
    "    if \"SSP\" in df.columns:\n",
    "        ssp_count_before = len(df)\n",
    "        df = df[df[\"SSP\"] == 1].copy()\n",
    "        print(f\"After SSP filter: {len(df)} records (removed {ssp_count_before - len(df)})\")\n",
    "    else:\n",
    "        print(f\"SSP variable not found in {code}, skipping SSP filtering.\")\n",
    "\n",
    "    # Filter to only include the specified rounds for this country\n",
    "    if \"t\" in df.columns:\n",
    "        rounds_count_before = len(df)\n",
    "        df_panel = df[df[\"t\"].isin(rounds)].copy()\n",
    "        print(f\"After rounds filter: {len(df_panel)} records (removed {rounds_count_before - len(df_panel)})\")\n",
    "        print(f\"Rounds in data after filter: {sorted(df_panel['t'].unique().tolist())}\")\n",
    "    else:\n",
    "        print(f\"ERROR: 't' variable not found in {code} dataset!\")\n",
    "        debug_info[code] = {\"error\": \"No 't' variable found\"}\n",
    "        continue\n",
    "    \n",
    "    # Retain only households that appear in all rounds\n",
    "    if panel_id in df.columns:\n",
    "        required_rounds = len(rounds)\n",
    "        household_count_before = df_panel[panel_id].nunique()\n",
    "        \n",
    "        # Debug: Show distribution of rounds per household\n",
    "        round_counts = df_panel.groupby(panel_id)[\"t\"].nunique().value_counts()\n",
    "        print(f\"Number of rounds per household:\")\n",
    "        for n_rounds, count in round_counts.items():\n",
    "            print(f\"  {n_rounds} rounds: {count} households\")\n",
    "        \n",
    "        # Keep only households with all rounds\n",
    "        counts = df_panel.groupby(panel_id)[\"t\"].nunique()\n",
    "        valid_ids = counts[counts == required_rounds].index\n",
    "        print(f\"Households with all {required_rounds} rounds: {len(valid_ids)} \" +\n",
    "              f\"(out of {household_count_before} total households)\")\n",
    "        \n",
    "        df_panel = df_panel[df_panel[panel_id].isin(valid_ids)]\n",
    "        print(f\"After complete panel filter: {len(df_panel)} records\")\n",
    "    else:\n",
    "        print(f\"ERROR: Household ID variable '{panel_id}' not found in {code} dataset!\")\n",
    "        debug_info[code] = {\"error\": f\"No '{panel_id}' variable found\"}\n",
    "        continue\n",
    "    \n",
    "    # Check if participate2 column exists\n",
    "    if \"participate2\" not in df_panel.columns:\n",
    "        print(f\"ERROR: 'participate2' variable not found in {code} dataset!\")\n",
    "        # Check if we can create it from 'prop' as a fallback\n",
    "        if \"prop\" in df_panel.columns:\n",
    "            print(\"Creating 'participate2' from 'prop' as a fallback\")\n",
    "            df_panel[\"participate2\"] = (df_panel[\"prop\"] > 0).astype(int)\n",
    "            print(f\"Created 'participate2' with values: {df_panel['participate2'].value_counts().to_dict()}\")\n",
    "        else:\n",
    "            debug_info[code] = {\"error\": \"No 'participate2' or 'prop' variable found\"}\n",
    "            continue\n",
    "    \n",
    "    # --- PART 1: Basic Classification ---\n",
    "    try:\n",
    "        # Compute, for each household, the minimum and maximum of 'participate2'\n",
    "        summary = df_panel.groupby(panel_id)[\"participate2\"].agg([\"min\", \"max\"])\n",
    "        print(f\"\\nHousehold summary statistics created. Shape: {summary.shape}\")\n",
    "        \n",
    "        always_sold = (summary[\"min\"] > 0).sum()   # Participated in every round\n",
    "        never_sold = (summary[\"max\"] == 0).sum()   # Never participated\n",
    "        mixed = len(summary) - always_sold - never_sold  # Mixed behavior\n",
    "        \n",
    "        print(f\"Classification results:\")\n",
    "        print(f\"  Always participated: {always_sold} households\")\n",
    "        print(f\"  Never participated: {never_sold} households\")\n",
    "        print(f\"  Mixed behavior: {mixed} households\")\n",
    "        \n",
    "        total = len(summary)\n",
    "        panel_results[code] = {\n",
    "            \"Total Households\": total,\n",
    "            \"Always Participated (%)\": always_sold / total * 100 if total > 0 else np.nan,\n",
    "            \"Never Participated (%)\": never_sold / total * 100 if total > 0 else np.nan,\n",
    "            \"Mixed (%)\": mixed / total * 100 if total > 0 else np.nan\n",
    "        }\n",
    "    except Exception as e:\n",
    "        print(f\"Error in basic classification: {e}\")\n",
    "        debug_info[code] = {\"error\": f\"Error in basic classification: {str(e)}\"}\n",
    "        continue\n",
    "    \n",
    "    # --- PART 2: Detailed Classification of Mixed Households ---\n",
    "    try:\n",
    "        # Group by household (sorted by round)\n",
    "        grouped = df_panel.sort_values(\"t\").groupby(panel_id)\n",
    "        print(f\"\\nStarting detailed classification of {mixed} mixed households\")\n",
    "        \n",
    "        orig_classifications = {}\n",
    "        relaxed_classifications = {}\n",
    "        mixed_ids = []\n",
    "        \n",
    "        # Debug: Check a few household sequences\n",
    "        sample_count = min(5, len(grouped))\n",
    "        print(f\"Examining {sample_count} sample households:\")\n",
    "        \n",
    "        for i, (hh, group) in enumerate(grouped):\n",
    "            if i >= sample_count:\n",
    "                break\n",
    "                \n",
    "            # Sort group by round and get 'participate2' values\n",
    "            seq = list(group.sort_values(\"t\")[\"participate2\"])\n",
    "            print(f\"  Household {hh}: rounds={list(group.sort_values('t')['t'])}, participate2={seq}\")\n",
    "        \n",
    "        # Now process all households\n",
    "        for hh, group in grouped:\n",
    "            # Sort group by round and get 'participate2' values\n",
    "            seq = list(group.sort_values(\"t\")[\"participate2\"])\n",
    "            # Already binary: 1 if participated, 0 otherwise\n",
    "            binary_seq = [1 if (pd.notnull(x) and x > 0) else 0 for x in seq]\n",
    "            # Only consider households with mixed behavior (not all 0's or all 1's)\n",
    "            if all(x == 0 for x in binary_seq) or all(x == 1 for x in binary_seq):\n",
    "                continue\n",
    "            orig_classifications[hh] = classify_mixed(binary_seq)\n",
    "            relaxed_classifications[hh] = classify_mixed_relaxed(binary_seq)\n",
    "            mixed_ids.append(hh)\n",
    "        \n",
    "        total_mixed = len(mixed_ids)\n",
    "        print(f\"Found {total_mixed} households with mixed behavior for detailed classification\")\n",
    "        \n",
    "        if total_mixed == 0:\n",
    "            print(\"No mixed households for detailed classification!\")\n",
    "            original_results[code] = {\n",
    "                \"Total Mixed Households\": 0,\n",
    "                \"Became Commercial (%)\": np.nan,\n",
    "                \"Stopped Commercial (%)\": np.nan,\n",
    "                \"Fluctuating (%)\": np.nan\n",
    "            }\n",
    "            relaxed_results[code] = {\n",
    "                \"Total Mixed Households\": 0,\n",
    "                \"Became Commercial (%)\": np.nan,\n",
    "                \"Stopped Commercial (%)\": np.nan,\n",
    "                \"Fluctuating (%)\": np.nan\n",
    "            }\n",
    "        else:\n",
    "            count_became = sum(1 for v in orig_classifications.values() if v == \"became_commercial\")\n",
    "            count_stopped = sum(1 for v in orig_classifications.values() if v == \"stopped_commercial\")\n",
    "            count_fluct = sum(1 for v in orig_classifications.values() if v == \"fluctuating\")\n",
    "            \n",
    "            print(f\"Original classification:\")\n",
    "            print(f\"  Became commercial: {count_became} ({count_became/total_mixed*100:.1f}%)\")\n",
    "            print(f\"  Stopped commercial: {count_stopped} ({count_stopped/total_mixed*100:.1f}%)\")\n",
    "            print(f\"  Fluctuating: {count_fluct} ({count_fluct/total_mixed*100:.1f}%)\")\n",
    "            \n",
    "            original_results[code] = {\n",
    "                \"Total Mixed Households\": total_mixed,\n",
    "                \"Became Commercial (%)\": count_became / total_mixed * 100,\n",
    "                \"Stopped Commercial (%)\": count_stopped / total_mixed * 100,\n",
    "                \"Fluctuating (%)\": count_fluct / total_mixed * 100\n",
    "            }\n",
    "            \n",
    "            count_became_relaxed = sum(1 for v in relaxed_classifications.values() if v == \"became_commercial_relaxed\")\n",
    "            count_stopped_relaxed = sum(1 for v in relaxed_classifications.values() if v == \"stopped_commercial_relaxed\")\n",
    "            count_fluct_relaxed = sum(1 for v in relaxed_classifications.values() if v == \"fluctuating_relaxed\")\n",
    "            \n",
    "            print(f\"Relaxed classification:\")\n",
    "            print(f\"  Became commercial: {count_became_relaxed} ({count_became_relaxed/total_mixed*100:.1f}%)\")\n",
    "            print(f\"  Stopped commercial: {count_stopped_relaxed} ({count_stopped_relaxed/total_mixed*100:.1f}%)\")\n",
    "            print(f\"  Fluctuating: {count_fluct_relaxed} ({count_fluct_relaxed/total_mixed*100:.1f}%)\")\n",
    "            \n",
    "            relaxed_results[code] = {\n",
    "                \"Total Mixed Households\": total_mixed,\n",
    "                \"Became Commercial (%)\": count_became_relaxed / total_mixed * 100,\n",
    "                \"Stopped Commercial (%)\": count_stopped_relaxed / total_mixed * 100,\n",
    "                \"Fluctuating (%)\": count_fluct_relaxed / total_mixed * 100\n",
    "            }\n",
    "    except Exception as e:\n",
    "        print(f\"Error in detailed classification: {e}\")\n",
    "        debug_info[code] = {\"error\": f\"Error in detailed classification: {str(e)}\"}\n",
    "\n",
    "# Print summary of debugging info\n",
    "print(\"\\nSUMMARY OF PROCESSING:\")\n",
    "for code in panel_config.keys():\n",
    "    if code in debug_info:\n",
    "        print(f\"{code}: ERROR - {debug_info[code].get('error', 'Unknown error')}\")\n",
    "    elif code in panel_results:\n",
    "        print(f\"{code}: SUCCESS - {panel_results[code]['Total Households']} households processed\")\n",
    "    else:\n",
    "        print(f\"{code}: UNKNOWN STATUS\")\n",
    "\n",
    "# --- Prepare Final DataFrames ---\n",
    "\n",
    "# Basic panel results\n",
    "panel_results_df = pd.DataFrame(panel_results).T.reset_index()\n",
    "panel_results_df.rename(columns={\"index\": \"Country_Code\"}, inplace=True)\n",
    "panel_results_df[\"Country\"] = panel_results_df[\"Country_Code\"].map(country_names)\n",
    "panel_results_df = panel_results_df[[\"Country\", \"Total Households\", \n",
    "                                      \"Always Participated (%)\", \"Never Participated (%)\", \"Mixed (%)\"]]\n",
    "\n",
    "# Mixed household detailed classification - Original criteria\n",
    "orig_df = pd.DataFrame(original_results).T.reset_index()\n",
    "orig_df.rename(columns={\"index\": \"Country_Code\"}, inplace=True)\n",
    "orig_df[\"Country\"] = orig_df[\"Country_Code\"].map(country_names)\n",
    "orig_df = orig_df[[\"Country\", \"Total Mixed Households\", \"Became Commercial (%)\", \n",
    "                   \"Stopped Commercial (%)\", \"Fluctuating (%)\"]]\n",
    "\n",
    "# Mixed household detailed classification - Relaxed criteria\n",
    "relaxed_df = pd.DataFrame(relaxed_results).T.reset_index()\n",
    "relaxed_df.rename(columns={\"index\": \"Country_Code\"}, inplace=True)\n",
    "relaxed_df[\"Country\"] = relaxed_df[\"Country_Code\"].map(country_names)\n",
    "relaxed_df = relaxed_df[[\"Country\", \"Total Mixed Households\", \"Became Commercial (%)\", \n",
    "                         \"Stopped Commercial (%)\", \"Fluctuating (%)\"]]\n",
    "\n",
    "print(\"\\nPreparing Excel output...\")\n",
    "# --- Write Results to Excel ---\n",
    "output_folder = r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\Excel-Mauriio\"\n",
    "output_file = os.path.join(output_folder, \"unified_panel_analysis_debug.xlsx\")\n",
    "\n",
    "with pd.ExcelWriter(output_file, engine=\"openpyxl\") as writer:\n",
    "    panel_results_df.to_excel(writer, sheet_name=\"Basic_Panel_Results\", index=False)\n",
    "    orig_df.to_excel(writer, sheet_name=\"Mixed_Breakdown_Original\", index=False)\n",
    "    relaxed_df.to_excel(writer, sheet_name=\"Mixed_Breakdown_Relaxed\", index=False)\n",
    "\n",
    "print(f\"Unified panel analysis results saved at {output_file}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf4b900f",
   "metadata": {},
   "source": [
    "Appendix B, table B5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c72d9514",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:171: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['employ'] = ((df['ac_ent'] == 1) | (df['ac_salaried'] == 1)).astype(float)\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:168: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  df = pd.read_stata(file_path)\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:171: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['employ'] = ((df['ac_ent'] == 1) | (df['ac_salaried'] == 1)).astype(float)\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:171: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['employ'] = ((df['ac_ent'] == 1) | (df['ac_salaried'] == 1)).astype(float)\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:171: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['employ'] = ((df['ac_ent'] == 1) | (df['ac_salaried'] == 1)).astype(float)\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:171: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
      "  df['employ'] = ((df['ac_ent'] == 1) | (df['ac_salaried'] == 1)).astype(float)\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
      "C:\\Users\\andre\\AppData\\Local\\Temp\\ipykernel_32616\\2306183075.py:73: DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, pd.CategoricalDtype) instead\n",
      "  if pd.api.types.is_categorical_dtype(df_group['poor']):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Excel file created at:\n",
      "C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\Excel-Mauriio\\averages_and_ttests_output.xlsx\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import itertools\n",
    "from scipy.stats import ttest_ind, levene\n",
    "\n",
    "# Updated file paths and country names\n",
    "data_files = {\n",
    "    \"Ethiopia\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\EthPanel-5.dta\",\n",
    "    \"Malawi\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\MalPanel-5.dta\",\n",
    "    \"Nigeria\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\NigPanel-6.dta\",\n",
    "    \"Tanzania\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\TanPanel-5.dta\",\n",
    "    \"Uganda\": r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\UgaPanel-6.dta\"\n",
    "}\n",
    "\n",
    "# List of indicators (internal names) - added fpsh and Hungry\n",
    "indicators = ['FCS', 'RCI', 'EMP', 'MPI', 'Employment', 'fpsh', 'Hungry', 'Poor']\n",
    "\n",
    "# Mapping for renaming subgroup labels in the Excel output.\n",
    "# Note: Change 'k=0' from Subsistence to Non-selling.\n",
    "group_rename = {\n",
    "    'k=0': 'Non-selling',\n",
    "    'k=1': 'Tertile 1',\n",
    "    'k=2': 'Tertile 2',\n",
    "    'k=3': 'Tertile 3'\n",
    "}\n",
    "\n",
    "# Mapping for spider graph display labels - added fpsh and Hungry\n",
    "spider_labels = {\n",
    "    'FCS': 'FCS',\n",
    "    'RCI': 'RCI',\n",
    "    'EMP': 'EMP',\n",
    "    'MPI': 'Not MPI Poor',\n",
    "    'Employment': 'Employment',\n",
    "    'fpsh': 'FPSH',\n",
    "    'Hungry': 'Hungry'\n",
    "}\n",
    "\n",
    "def get_group_data(df, indicator, group_label, df_emp=None):\n",
    "    \"\"\"\n",
    "    Returns a Series with the indicator values for a given group.\n",
    "    For group 'Overall', the full dataset is used;\n",
    "    otherwise, the data are filtered for the subgroup (e.g., \"k=0\", \"k=1\", etc.).\n",
    "    \"\"\"\n",
    "    if group_label == \"Overall\":\n",
    "        df_group = df.copy()\n",
    "        if df_emp is not None:\n",
    "            df_group_emp = df_emp.copy()\n",
    "    else:\n",
    "        k_val = int(group_label.split('=')[1])\n",
    "        df_group = df[df['k'] == k_val]\n",
    "        if df_emp is not None:\n",
    "            df_group_emp = df_emp[df_emp['k'] == k_val]\n",
    "    \n",
    "    try:\n",
    "        if indicator == 'FCS':\n",
    "            # For FCS, use the raw value\n",
    "            values = df_group['FCS']\n",
    "            return values.dropna()\n",
    "        elif indicator == 'RCI':\n",
    "            # RCI is rci_norm * 100\n",
    "            values = df_group['rci_norm'] * 100\n",
    "            return values.dropna()\n",
    "        elif indicator == 'MPI':\n",
    "            # Percentage of MPI poor\n",
    "            values = df_group['mpi_poor'] * 100\n",
    "            return values.dropna()\n",
    "        elif indicator == 'Poor':\n",
    "            # Percentage of \"poor\"\n",
    "            if 'poor' not in df_group.columns:\n",
    "                print(f\"Warning: 'poor' column not found for {group_label}\")\n",
    "                return pd.Series([])\n",
    "            if pd.api.types.is_categorical_dtype(df_group['poor']):\n",
    "                values = (df_group['poor'] == 'Poor').astype(float) * 100\n",
    "            else:\n",
    "                values = df_group['poor'].astype(float) * 100\n",
    "            return values.dropna()\n",
    "        elif indicator == 'Employment':\n",
    "            # Off-farm employment\n",
    "            values = df_group['employ'] * 100\n",
    "            return values.dropna()\n",
    "        elif indicator == 'EMP':\n",
    "            # Empowerment (filtered dataset where t != 1)\n",
    "            values = df_group_emp['emp'] * 100\n",
    "            return values.dropna()\n",
    "        elif indicator == 'fpsh':\n",
    "            # fpsh * 100\n",
    "            values = df_group['fpsh'] * 100\n",
    "            return values.dropna()\n",
    "        elif indicator == 'Hungry':\n",
    "            # Binary indicator: 1 if FCS <= 21, else 0\n",
    "            values = (df_group['FCS'] <= 21).astype(float) * 100\n",
    "            return values.dropna()\n",
    "        else:\n",
    "            return pd.Series([])\n",
    "    except Exception as e:\n",
    "        print(f\"Error in get_group_data for indicator {indicator}, group {group_label}: {e}\")\n",
    "        return pd.Series([])\n",
    "\n",
    "def compute_average_for_indicator(df, indicator, group_label, df_emp=None):\n",
    "    \"\"\"\n",
    "    Computes the simple (unweighted) mean for the specified indicator and group.\n",
    "    \"\"\"\n",
    "    values = get_group_data(df, indicator, group_label, df_emp)\n",
    "    if values is None or len(values) == 0:\n",
    "        return np.nan\n",
    "    return values.mean()\n",
    "\n",
    "def compute_threshold_percentage(df, indicator, group_label, df_emp, overall_rci_threshold):\n",
    "    \"\"\"\n",
    "    Computes the percentage of households meeting the adequacy threshold for the given indicator\n",
    "    in the specified subgroup.\n",
    "    Thresholds:\n",
    "      - FCS: adequate if FCS >= 35\n",
    "      - RCI: adequate if rci_norm > overall_rci_threshold (25th percentile of the country)\n",
    "      - EMP: adequate if emp == 1 (filtered dataset)\n",
    "      - MPI: adequate if mpi_poor == 0\n",
    "      - Employment: adequate if employ == 1\n",
    "      - fpsh: adequate if fpsh == 1\n",
    "      - Hungry: indicator is (FCS <= 21)\n",
    "    \"\"\"\n",
    "    k_val = int(group_label.split('=')[1])\n",
    "    group_df = df[df['k'] == k_val]\n",
    "    if indicator == 'EMP':\n",
    "        group_df = df_emp[df_emp['k'] == k_val]\n",
    "    \n",
    "    if indicator == 'FCS':\n",
    "        condition = group_df['FCS'] >= 35\n",
    "        valid = group_df['FCS'].notna()\n",
    "    elif indicator == 'RCI':\n",
    "        condition = group_df['rci_norm'] > overall_rci_threshold\n",
    "        valid = group_df['rci_norm'].notna()\n",
    "    elif indicator == 'MPI':\n",
    "        condition = group_df['mpi_poor'] == 0\n",
    "        valid = group_df['mpi_poor'].notna()\n",
    "    elif indicator == 'Employment':\n",
    "        condition = group_df['employ'] == 1\n",
    "        valid = group_df['employ'].notna()\n",
    "    elif indicator == 'EMP':\n",
    "        condition = group_df['emp'] == 1\n",
    "        valid = group_df['emp'].notna()\n",
    "    elif indicator == 'fpsh':\n",
    "        condition = group_df['fpsh'] == 1\n",
    "        valid = group_df['fpsh'].notna()\n",
    "    elif indicator == 'Hungry':\n",
    "        condition = group_df['FCS'] <= 21\n",
    "        valid = group_df['FCS'].notna()\n",
    "    else:\n",
    "        return np.nan\n",
    "    \n",
    "    if valid.sum() == 0:\n",
    "        return np.nan\n",
    "    return condition.sum() / valid.sum() * 100\n",
    "\n",
    "# Dictionary to store results per country.\n",
    "# For each country, we store:\n",
    "# - 'Raw': a dictionary of simple averages for each group.\n",
    "# - 'TTests': a dictionary with pairwise t-test results for each indicator.\n",
    "# - 'Spider': a dictionary with threshold percentages for spider graphs (for k=1 and k=3).\n",
    "results = {}\n",
    "\n",
    "# Define groups: compute overall averages but for t-tests we only compare k=0 to k=3.\n",
    "groups = ['Overall', 'k=0', 'k=1', 'k=2', 'k=3']\n",
    "groups_for_ttests = ['k=0', 'k=1', 'k=2', 'k=3']\n",
    "\n",
    "for country, file_path in data_files.items():\n",
    "    # Read dataset\n",
    "    df = pd.read_stata(file_path)\n",
    "    \n",
    "    # Create the 'employ' variable (indicator for non-farm employment)\n",
    "    df['employ'] = ((df['ac_ent'] == 1) | (df['ac_salaried'] == 1)).astype(float)\n",
    "    \n",
    "    # Create filtered data for EMP (only when t != 1)\n",
    "    df_emp = df[df['t'] != 1].copy()\n",
    "    \n",
    "    raw_averages = {}\n",
    "    # Also store the underlying data (non-NaN values) for each indicator and group for t-tests.\n",
    "    underlying_data = {ind: {} for ind in indicators}\n",
    "    \n",
    "    for grp in groups:\n",
    "        raw_averages[grp] = {}\n",
    "        for ind in indicators:\n",
    "            if ind == 'EMP':\n",
    "                avg_val = compute_average_for_indicator(df, ind, grp, df_emp)\n",
    "                underlying_data[ind][grp] = get_group_data(df, ind, grp, df_emp)\n",
    "            else:\n",
    "                avg_val = compute_average_for_indicator(df, ind, grp)\n",
    "                underlying_data[ind][grp] = get_group_data(df, ind, grp)\n",
    "            raw_averages[grp][ind] = avg_val\n",
    "    \n",
    "    # Compute pairwise t-tests for each indicator (only among k=0 to k=3).\n",
    "    ttest_results = {}\n",
    "    for ind in indicators:\n",
    "        comparisons = []\n",
    "        for grp1, grp2 in itertools.combinations(groups_for_ttests, 2):\n",
    "            data1 = underlying_data[ind][grp1]\n",
    "            data2 = underlying_data[ind][grp2]\n",
    "            # Only test if both groups have at least two observations.\n",
    "            if len(data1) < 2 or len(data2) < 2:\n",
    "                continue\n",
    "            # Check homogeneity of variances using Levene's test (with median centering).\n",
    "            lev_stat, lev_p = levene(data1, data2, center='median')\n",
    "            equal_var = lev_p > 0.05\n",
    "            test_type = \"Equal variances\" if equal_var else \"Welch\"\n",
    "            t_stat, p_val = ttest_ind(data1, data2, equal_var=equal_var, nan_policy='omit')\n",
    "            conclusion = \"Not Statistically Different\" if p_val > 0.05 else \"Statistically Different\"\n",
    "            comparisons.append({\n",
    "                'Group1': grp1,\n",
    "                'Group2': grp2,\n",
    "                't-statistic': t_stat,\n",
    "                'p-value': p_val,\n",
    "                'Test Used': test_type,\n",
    "                'Levene p-value': lev_p,\n",
    "                'Conclusion': conclusion\n",
    "            })\n",
    "        # Store results in a DataFrame\n",
    "        df_tt = pd.DataFrame(comparisons)\n",
    "        \n",
    "        # For display in the country sheets, rename groups right away\n",
    "        if not df_tt.empty:\n",
    "            df_tt['Group1'] = df_tt['Group1'].replace(group_rename)\n",
    "            df_tt['Group2'] = df_tt['Group2'].replace(group_rename)\n",
    "        ttest_results[ind] = df_tt\n",
    "    \n",
    "    # Compute overall RCI threshold (25th percentile) for the country.\n",
    "    overall_rci_threshold = df['rci_norm'].quantile(0.25)\n",
    "    \n",
    "    # Compute spider graph threshold percentages for groups k=1 (Tertile 1) and k=3 (Tertile 3).\n",
    "    spider = {}\n",
    "    for ind in indicators:\n",
    "        perc_tert1 = compute_threshold_percentage(df, ind, 'k=1', df_emp, overall_rci_threshold)\n",
    "        perc_tert3 = compute_threshold_percentage(df, ind, 'k=3', df_emp, overall_rci_threshold)\n",
    "        spider[ind] = {'k=1': perc_tert1, 'k=3': perc_tert3}\n",
    "    \n",
    "    results[country] = {\n",
    "        'Raw': raw_averages,\n",
    "        'TTests': ttest_results,\n",
    "        'Spider': spider\n",
    "    }\n",
    "\n",
    "# Set the output Excel file path\n",
    "excel_filename = r'C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\Excel-Mauriio\\averages_and_ttests_output.xlsx'\n",
    "\n",
    "with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:\n",
    "    ########################################################################\n",
    "    # 1) Create one sheet per country with raw averages and t-test results\n",
    "    ########################################################################\n",
    "    for country, data in results.items():\n",
    "        sheet_name = country\n",
    "        raw_df = pd.DataFrame(data['Raw']).T\n",
    "        \n",
    "        # Rename row indices (k=0 -> Non-selling, etc.), but keep \"Overall\" as is\n",
    "        raw_df.index = raw_df.index.map(lambda x: group_rename.get(x, x))\n",
    "        \n",
    "        # Round numeric values to one decimal\n",
    "        raw_df = raw_df.round(1)\n",
    "        \n",
    "        # Rename \"MPI\" column to \"Not MPI Poor\" for display\n",
    "        if 'MPI' in raw_df.columns:\n",
    "            raw_df.rename(columns={'MPI': 'Not MPI Poor'}, inplace=True)\n",
    "        \n",
    "        raw_df.to_excel(writer, sheet_name=sheet_name, startrow=1, startcol=0)\n",
    "        worksheet = writer.sheets[sheet_name]\n",
    "        worksheet.write(0, 0, f\"{country} - Simple Averages (Unweighted)\")\n",
    "        \n",
    "        # Write the t-test results below the raw averages\n",
    "        start_row = raw_df.shape[0] + 4\n",
    "        for ind, df_tt in data['TTests'].items():\n",
    "            if not df_tt.empty:\n",
    "                # Round p-values, t-statistics, etc. to 1 decimal or so\n",
    "                df_tt = df_tt.round(1)\n",
    "                worksheet.write(start_row, 0, f\"T-tests for {spider_labels.get(ind, ind)}\")\n",
    "                df_tt.to_excel(writer, sheet_name=sheet_name, startrow=start_row+1, startcol=0, index=False)\n",
    "                start_row += df_tt.shape[0] + 4\n",
    "    \n",
    "    ########################################################################\n",
    "    # 2) Create an extra sheet for spider (radar) charts\n",
    "    ########################################################################\n",
    "    charts_ws = writer.book.add_worksheet(\"Spider Charts\")\n",
    "    writer.sheets[\"Spider Charts\"] = charts_ws\n",
    "    current_row = 0\n",
    "    \n",
    "    for country, data in results.items():\n",
    "        charts_ws.write(current_row, 0, f\"Country: {country}\")\n",
    "        current_row += 1\n",
    "        \n",
    "        charts_ws.write(current_row, 0, \"Indicator\")\n",
    "        charts_ws.write(current_row, 1, \"Tertile 1 (%)\")\n",
    "        charts_ws.write(current_row, 2, \"Tertile 3 (%)\")\n",
    "        current_row += 1\n",
    "        \n",
    "        start_table_row = current_row\n",
    "        for ind in indicators:\n",
    "            charts_ws.write(current_row, 0, spider_labels.get(ind, ind))\n",
    "            perc1 = data['Spider'][ind]['k=1']\n",
    "            perc3 = data['Spider'][ind]['k=3']\n",
    "            charts_ws.write(current_row, 1, round(perc1, 1) if pd.notna(perc1) else \"\")\n",
    "            charts_ws.write(current_row, 2, round(perc3, 1) if pd.notna(perc3) else \"\")\n",
    "            current_row += 1\n",
    "        \n",
    "        end_table_row = current_row - 1\n",
    "        \n",
    "        # Create a radar chart\n",
    "        chart = writer.book.add_chart({'type': 'radar'})\n",
    "        chart.add_series({\n",
    "            'name':       'Tertile 1',\n",
    "            'categories': ['Spider Charts', start_table_row, 0, end_table_row, 0],\n",
    "            'values':     ['Spider Charts', start_table_row, 1, end_table_row, 1],\n",
    "        })\n",
    "        chart.add_series({\n",
    "            'name':       'Tertile 3',\n",
    "            'categories': ['Spider Charts', start_table_row, 0, end_table_row, 0],\n",
    "            'values':     ['Spider Charts', start_table_row, 2, end_table_row, 2],\n",
    "        })\n",
    "        chart.set_title({'name': f\"{country} - Spider Chart\"})\n",
    "        chart.set_style(10)\n",
    "        \n",
    "        charts_ws.insert_chart(\n",
    "            current_row - (end_table_row - start_table_row + 1),\n",
    "            4,\n",
    "            chart,\n",
    "            {'x_offset': 25, 'y_offset': 10}\n",
    "        )\n",
    "        current_row += 5\n",
    "    \n",
    "    ########################################################################\n",
    "    # 3) Create an additional sheet for the requested comparison tables\n",
    "    ########################################################################\n",
    "    comp_ws = writer.book.add_worksheet(\"Comparison Tables\")\n",
    "    writer.sheets[\"Comparison Tables\"] = comp_ws\n",
    "    current_row = 0\n",
    "    \n",
    "    # Define the row labels for averages and the column names for the new table\n",
    "    avg_rows = [\"Overall\", \"Non-selling\", \"Tertile 1\", \"Tertile 2\", \"Tertile 3\"]\n",
    "    comp_cols = [\n",
    "        \"Food Consumption Score\",\n",
    "        \"Resilience Index Score\",\n",
    "        \"Is Empowered\",\n",
    "        \"Has Off-farm employment\",\n",
    "        \"Is Asset Poor\"\n",
    "    ]\n",
    "    # Mapping from new table columns to our indicator names\n",
    "    col_to_indicator = {\n",
    "        \"Food Consumption Score\": \"FCS\",\n",
    "        \"Resilience Index Score\": \"RCI\",\n",
    "        \"Is Empowered\": \"EMP\",\n",
    "        \"Has Off-farm employment\": \"Employment\",\n",
    "        \"Is Asset Poor\": \"Poor\"\n",
    "    }\n",
    "    \n",
    "    for country, data in results.items():\n",
    "        comp_ws.write(current_row, 0, f\"Country: {country}\")\n",
    "        current_row += 1\n",
    "        \n",
    "        # Build a dictionary of row labels -> {column_name -> value}\n",
    "        table_rows = {}\n",
    "        group_keys = {\n",
    "            \"Overall\": \"Overall\",\n",
    "            \"Non-selling\": \"k=0\",\n",
    "            \"Tertile 1\": \"k=1\",\n",
    "            \"Tertile 2\": \"k=2\",\n",
    "            \"Tertile 3\": \"k=3\"\n",
    "        }\n",
    "        \n",
    "        for label in avg_rows:\n",
    "            grp_key = group_keys[label]\n",
    "            row_vals = {}\n",
    "            for col in comp_cols:\n",
    "                indicator_key = col_to_indicator[col]\n",
    "                val = data['Raw'].get(grp_key, {}).get(indicator_key, np.nan)\n",
    "                row_vals[col] = round(val, 1) if pd.notna(val) else \"\"\n",
    "            table_rows[label] = row_vals\n",
    "        \n",
    "        # Convert to DataFrame\n",
    "        table_df = pd.DataFrame.from_dict(table_rows, orient='index', columns=comp_cols)\n",
    "        \n",
    "        # Now add two rows for t-test difference results, referencing the *renamed* groups\n",
    "        # The T-tests data frames have Group1/Group2 as \"Non-selling\", \"Tertile 1\", etc.\n",
    "        \n",
    "        # 1) Difference between Tertile 3 and Non-selling\n",
    "        diff_row_1 = {}\n",
    "        for col in comp_cols:\n",
    "            indicator_key = col_to_indicator[col]\n",
    "            df_tt = data['TTests'][indicator_key]\n",
    "            if not df_tt.empty:\n",
    "                # Look for row comparing 'Non-selling' and 'Tertile 3'\n",
    "                row_match = df_tt[\n",
    "                    (df_tt['Group1'] == 'Non-selling') & (df_tt['Group2'] == 'Tertile 3')\n",
    "                ]\n",
    "                if not row_match.empty:\n",
    "                    p_val = row_match.iloc[0]['p-value']\n",
    "                    diff_row_1[col] = \"Yes\" if p_val <= 0.05 else \"No\"\n",
    "                else:\n",
    "                    # If not found in that order, check the reverse\n",
    "                    # (unlikely due to the way combinations are generated, but just in case)\n",
    "                    row_match_rev = df_tt[\n",
    "                        (df_tt['Group1'] == 'Tertile 3') & (df_tt['Group2'] == 'Non-selling')\n",
    "                    ]\n",
    "                    if not row_match_rev.empty:\n",
    "                        p_val = row_match_rev.iloc[0]['p-value']\n",
    "                        diff_row_1[col] = \"Yes\" if p_val <= 0.05 else \"No\"\n",
    "                    else:\n",
    "                        diff_row_1[col] = \"\"\n",
    "            else:\n",
    "                diff_row_1[col] = \"\"\n",
    "        \n",
    "        # 2) Difference between Tertile 3 and Tertile 1\n",
    "        diff_row_2 = {}\n",
    "        for col in comp_cols:\n",
    "            indicator_key = col_to_indicator[col]\n",
    "            df_tt = data['TTests'][indicator_key]\n",
    "            if not df_tt.empty:\n",
    "                row_match = df_tt[\n",
    "                    (df_tt['Group1'] == 'Tertile 1') & (df_tt['Group2'] == 'Tertile 3')\n",
    "                ]\n",
    "                if not row_match.empty:\n",
    "                    p_val = row_match.iloc[0]['p-value']\n",
    "                    diff_row_2[col] = \"Yes\" if p_val <= 0.05 else \"No\"\n",
    "                else:\n",
    "                    # Check reverse\n",
    "                    row_match_rev = df_tt[\n",
    "                        (df_tt['Group1'] == 'Tertile 3') & (df_tt['Group2'] == 'Tertile 1')\n",
    "                    ]\n",
    "                    if not row_match_rev.empty:\n",
    "                        p_val = row_match_rev.iloc[0]['p-value']\n",
    "                        diff_row_2[col] = \"Yes\" if p_val <= 0.05 else \"No\"\n",
    "                    else:\n",
    "                        diff_row_2[col] = \"\"\n",
    "            else:\n",
    "                diff_row_2[col] = \"\"\n",
    "        \n",
    "        table_df.loc[\"Difference between tertile 3 and non-selling\"] = diff_row_1\n",
    "        table_df.loc[\"Difference between tertile 3 and tertile 1\"] = diff_row_2\n",
    "        \n",
    "        # Write the table to the Comparison Tables sheet\n",
    "        table_df.to_excel(\n",
    "            writer,\n",
    "            sheet_name=\"Comparison Tables\",\n",
    "            startrow=current_row,\n",
    "            startcol=0\n",
    "        )\n",
    "        current_row += table_df.shape[0] + 3\n",
    "\n",
    "print(\"Excel file created at:\")\n",
    "print(os.path.abspath(excel_filename))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42108450",
   "metadata": {},
   "source": [
    "Appendix D"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd5c6232",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---------------------------------------------------------------\n",
    "# incindex_panel_plots.py      •  2025-05-08  (strict significance)\n",
    "# ---------------------------------------------------------------\n",
    "import pathlib, numpy as np, pandas as pd, matplotlib.pyplot as plt\n",
    "from linearmodels.panel import PanelOLS\n",
    "\n",
    "# ───── user settings ───────────────────────────────────────────\n",
    "ALPHA = 0.099        # significance threshold for every polynomial term\n",
    "SHOW_SKIPPED = True # print message when a spec is skipped\n",
    "\n",
    "# ───── 1.  PANEL DATA FILES ────────────────────────────────────\n",
    "panels = {\n",
    "    \"Malawi\"   : r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\MalPanel-5.dta\",\n",
    "    \"Ethiopia\" : r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\EthPanel-5.dta\",\n",
    "    \"Uganda\"   : r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\UgaPanel-6.dta\",\n",
    "    \"Tanzania\" : r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\TanPanel-5.dta\",\n",
    "    \"Nigeria\"  : r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Data Fred\\Panel\\NigPanel-6.dta\",\n",
    "}\n",
    "\n",
    "RESULTS_DIR = pathlib.Path(\n",
    "    r\"C:\\Users\\andre\\OneDrive\\Desktop\\BMGF\\LSMS ISA\\MAP\\Results\\incindex_panel_plots\"\n",
    ")\n",
    "RESULTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# ───── 2.  CONTROL SETS ────────────────────────────────────────\n",
    "BASE_CTRL = [\"agehead\", \"female\", \"ac_workforce\", \"dependency_ratio\",\n",
    "             \"hhsize\", \"NonfIncShare\", \"overall_shock\"]\n",
    "\n",
    "CTRL_FARM = BASE_CTRL + [\"sell_msme\"]\n",
    "CTRL_CCI  = BASE_CTRL + [\"ac_ent\", \"farmsize\"]\n",
    "\n",
    "ENTITY, TIME, CLUST = \"id\", \"t\", \"cl\"\n",
    "OUTCOME             = \"incindex3_z\"\n",
    "\n",
    "LABELS = {\"farmsize\": \"Farm size (ha)\",\n",
    "          \"cci\"     : \"Share commercialized\"}\n",
    "\n",
    "plt.style.use(\"default\")\n",
    "GRID_KW = dict(which=\"major\", linestyle=\":\", linewidth=0.6, color=\"lightgrey\")\n",
    "\n",
    "# ───── helpers ────────────────────────────────────────────────\n",
    "def add_polynomials(df: pd.DataFrame, x: str, degree: int) -> pd.DataFrame:\n",
    "    if degree >= 2:\n",
    "        df[f\"{x}2\"] = df[x] ** 2\n",
    "    if degree == 3:\n",
    "        df[f\"{x}3\"] = df[x] ** 3\n",
    "    return df\n",
    "\n",
    "def derivative(b1, b2, b3, grid, degree):\n",
    "    return b1 + 2*b2*grid + (3*b3*grid**2 if degree == 3 else 0)\n",
    "\n",
    "def all_significant(res, x, degree) -> bool:\n",
    "    \"\"\"Return True only if *every* polynomial term is significant.\"\"\"\n",
    "    terms = [x] + ([f\"{x}2\"] if degree >= 2 else []) + ([f\"{x}3\"] if degree == 3 else [])\n",
    "    return all(res.pvalues.get(t, 1.0) < ALPHA for t in terms)\n",
    "\n",
    "# ───── 3.  CORE FUNCTION ──────────────────────────────────────\n",
    "# ───── 3.  CORE FUNCTION ──────────────────────────────────────\n",
    "def run_plot(df: pd.DataFrame, *, x, x_label, controls, name, degree,\n",
    "             dump_txt: pathlib.Path, png_path: pathlib.Path,\n",
    "             grid_bounds=None, tag=\"\"):\n",
    "\n",
    "    # ── 1.  Normalise outcome and build polynomials ───────────\n",
    "    df[OUTCOME] = (df[\"incindex3\"] - df[\"incindex3\"].mean()) / df[\"incindex3\"].std(ddof=0)\n",
    "    df = add_polynomials(df, x, degree)\n",
    "\n",
    "    # auxiliary dummy\n",
    "    df[\"sell_msme\"] = ((df[\"ac_ent\"] == 1) |\n",
    "                       ((df[\"cci\"] > 0.01) & df[\"cci\"].notna())).astype(int)\n",
    "\n",
    "    poly_terms = [x] + ([f\"{x}2\"] if degree >= 2 else []) + ([f\"{x}3\"] if degree == 3 else [])\n",
    "    keep = [OUTCOME, ENTITY, TIME, CLUST] + controls + poly_terms\n",
    "    df   = df.dropna(subset=keep)\n",
    "\n",
    "    # ── 2.  Panel regression ──────────────────────────────────\n",
    "    df_pan = df.set_index([ENTITY, TIME])\n",
    "    rhs    = \" + \".join(poly_terms + controls) + \" + EntityEffects + TimeEffects\"\n",
    "    res    = PanelOLS.from_formula(f\"{OUTCOME} ~ 1 + {rhs}\", data=df_pan) \\\n",
    "                     .fit(cov_type=\"clustered\", clusters=df_pan[CLUST])\n",
    "\n",
    "    # ── 3.  Strict significance filter ────────────────────────\n",
    "    if not all_significant(res, x, degree):\n",
    "        if SHOW_SKIPPED:\n",
    "            rel = \"quadratic\" if degree == 2 else \"cubic\"\n",
    "            print(f\"×  {name} {x}{tag} ({rel}): one or more p≥{ALPHA:.2g} — skipped\")\n",
    "        return\n",
    "\n",
    "    # save regression table\n",
    "    dump_txt.write_text(res.summary.as_text(), encoding=\"utf-8\")\n",
    "\n",
    "    # ── 4.  Prediction grid ───────────────────────────────────\n",
    "    lo, hi = np.percentile(df[x], [1, 99]) if not grid_bounds else grid_bounds\n",
    "    grid   = np.linspace(lo, hi, 201)\n",
    "\n",
    "    base_id, base_t = df[ENTITY].mode()[0], df[TIME].min()\n",
    "    X_pred = pd.DataFrame(0.0,\n",
    "        index=pd.MultiIndex.from_arrays([np.repeat(base_id, grid.size),\n",
    "                                         np.repeat(base_t, grid.size)],\n",
    "                                         names=[ENTITY, TIME]),\n",
    "        columns=res.params.index)\n",
    "\n",
    "    X_pred[x] = grid\n",
    "    if degree >= 2: X_pred[f\"{x}2\"] = grid**2\n",
    "    if degree == 3: X_pred[f\"{x}3\"] = grid**3\n",
    "    for c in controls:\n",
    "        if c in X_pred.columns:\n",
    "            X_pred[c] = df[c].mean()\n",
    "    if \"Intercept\" in X_pred.columns:\n",
    "        X_pred[\"Intercept\"] = 1.0\n",
    "\n",
    "    yhat = (X_pred @ res.params).values\n",
    "    se   = np.sqrt(np.einsum(\"ij,jk,ik->i\", X_pred, res.cov.values, X_pred))\n",
    "    ci_lo, ci_hi = yhat - 1.96*se, yhat + 1.96*se\n",
    "\n",
    "    # derivative for marginal‐effect line\n",
    "    b1 = res.params[x]\n",
    "    b2 = res.params.get(f\"{x}2\", 0)\n",
    "    b3 = res.params.get(f\"{x}3\", 0)\n",
    "    dydx = derivative(b1, b2, b3, grid, degree)\n",
    "\n",
    "    # ── 5.  Locate peak ───────────────────────────────────────\n",
    "    peak_idx = yhat.argmax()\n",
    "    peak_x   = grid[peak_idx]\n",
    "    peak_y   = yhat[peak_idx]\n",
    "\n",
    "    # ── 6.  Plot ──────────────────────────────────────────────\n",
    "    fig, ax1 = plt.subplots(figsize=(6.4, 4.5))         # extra width\n",
    "    ax1.fill_between(grid, ci_lo, ci_hi, color=\"lightgrey\", alpha=0.6,\n",
    "                     label=\"95 % CI\")\n",
    "    ax1.plot(grid, yhat, lw=2, label=\"Expected Inclusion index\")\n",
    "    ax1.plot(peak_x, peak_y, \"o\", ms=6, color=\"red\", zorder=5,\n",
    "             label=f\"Peak = {peak_x:.2f}\")\n",
    "    ax1.annotate(f\"{peak_x:.2f}\", (peak_x, peak_y),\n",
    "                 xytext=(4, 4), textcoords=\"offset points\",\n",
    "                 fontsize=8, color=\"red\")\n",
    "\n",
    "    ax1.set_xlabel(x_label)\n",
    "    ax1.set_ylabel(\"Inclusion index (z-score)\")\n",
    "    ax1.grid(**GRID_KW)\n",
    "\n",
    "    ax2 = ax1.twinx()\n",
    "    ax2.plot(grid, dydx, ls=\"--\", lw=2, label=\"Marginal effect\", color=\"green\")\n",
    "    ax2.set_ylabel(f\"∂ Inclusion index / ∂ {x_label.lower()}\", color=\"green\")\n",
    "    ax2.tick_params(axis=\"y\", labelcolor=\"green\")\n",
    "\n",
    "    # combined legend\n",
    "        # combined legend  (moved inside axes, upper-left)\n",
    "    lines, labels = [], []\n",
    "    for ax in (ax1, ax2):\n",
    "        L, lab = ax.get_legend_handles_labels()\n",
    "        lines += L; labels += lab\n",
    "    #ax1.legend(lines, labels, loc=\"upper left\", frameon=False)\n",
    "\n",
    "    # give extra head-room\n",
    "    fig.tight_layout(rect=[0, 0, 1, 0.90])\n",
    "\n",
    "    # ── new, concise two-line title ────────────────────────────\n",
    "    relation = \"Quadratic\" if degree == 2 else \"Cubic\"\n",
    "    ax1.set_title(\n",
    "        f\"{relation} relationship\\nShare commercialized  →  Inclusion Index ({name})\",\n",
    "        fontsize=10,\n",
    "        pad=18\n",
    "    )\n",
    "\n",
    "\n",
    "    fig.savefig(png_path, dpi=300); plt.close(fig)\n",
    "    print(f\"✓  {png_path.name}\")\n",
    "\n",
    "# ───── 4.  MAIN LOOP ───────────────────────────────────────────\n",
    "for name, path in panels.items():\n",
    "    print(f\"\\n→ {name}: {path}\")\n",
    "    df0 = pd.read_stata(path, convert_categoricals=False)\n",
    "    df0 = df0[df0[\"SSP\"] == 1].copy()\n",
    "\n",
    "\n",
    "    # prep vars\n",
    "    df0[\"cci\"] = df0[\"cci\"].fillna(0)\n",
    "    if \"qs\" not in df0.columns:\n",
    "        raise KeyError(\"'qs' missing in panel data.\")\n",
    "\n",
    "    # full-sample models\n",
    "    for deg in (2, 3):\n",
    "        run_plot(df0.copy(), x=\"farmsize\", x_label=LABELS[\"farmsize\"],\n",
    "                 controls=CTRL_FARM, name=name, degree=deg,\n",
    "                 dump_txt=RESULTS_DIR / f\"{name.lower()}_farmsize_{'quad' if deg==2 else 'cubic'}_reg.txt\",\n",
    "                 png_path=RESULTS_DIR / f\"{name.lower()}_farmsize_{'quad' if deg==2 else 'cubic'}.png\",\n",
    "                 grid_bounds=(0, 40))\n",
    "\n",
    "        run_plot(df0.copy(), x=\"cci\", x_label=LABELS[\"cci\"],\n",
    "                 controls=CTRL_CCI, name=name, degree=deg,\n",
    "                 dump_txt=RESULTS_DIR / f\"{name.lower()}_cci_{'quad' if deg==2 else 'cubic'}_reg.txt\",\n",
    "                 png_path=RESULTS_DIR / f\"{name.lower()}_cci_{'quad' if deg==2 else 'cubic'}.png\")\n",
    "\n",
    "    # SSP-only (farm-size) models\n",
    "    if \"SSP\" not in df0.columns:\n",
    "        raise KeyError(\"`SSP` variable missing (should flag small-scale producers).\")\n",
    "\n",
    "    df_ssp = df0[df0[\"SSP\"] == 1].copy()\n",
    "    for deg in (2, 3):\n",
    "        run_plot(df_ssp, x=\"farmsize\", x_label=LABELS[\"farmsize\"],\n",
    "                 controls=CTRL_FARM, name=name, degree=deg,\n",
    "                 dump_txt=RESULTS_DIR / f\"{name.lower()}_farmsize_ssp1_{'quad' if deg==2 else 'cubic'}_reg.txt\",\n",
    "                 png_path=RESULTS_DIR / f\"{name.lower()}_farmsize_ssp1_{'quad' if deg==2 else 'cubic'}.png\",\n",
    "                 grid_bounds=(0, 40), tag=\"_ssp1\")\n",
    "\n",
    "print(\"\\nFinished.  Outputs saved in:\", RESULTS_DIR)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "geo_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
